{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DuN2KIZUlzbS"
      },
      "outputs": [],
      "source": [
        "# Copyright 2024 Google LLC\n",
        "#\n",
        "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "#     https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "q3pW20ECIDpX"
      },
      "source": [
        "# Parsing and Chunking in Vertex AI Search: Featuring BYO Capabilities\n",
        "<table align=\"left\">\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://art-analytics.appspot.com/r.html?uaid=G-FHXEFWTT4E&utm_source=aRT-vais-building-blocks&utm_medium=aRT-clicks&utm_campaign=vais-building-blocks&destination=vais-building-blocks&url=https%3A%2F%2Fcolab.research.google.com%2Fgithub%2FGoogleCloudPlatform%2Fapplied-ai-engineering-samples%2Fblob%2Fmain%2Fsearch%2Fvais-building-blocks%2Fparsing_and_chunking_with_BYO.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg\" alt=\"Google Colaboratory logo\"><br> Open in Colab\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://art-analytics.appspot.com/r.html?uaid=G-FHXEFWTT4E&utm_source=aRT-vais-building-blocks&utm_medium=aRT-clicks&utm_campaign=vais-building-blocks&destination=vais-building-blocks&url=https%3A%2F%2Fconsole.cloud.google.com%2Fvertex-ai%2Fcolab%2Fimport%2Fhttps%3A%252F%252Fraw.githubusercontent.com%252FGoogleCloudPlatform%252Fapplied-ai-engineering-samples%252Fmain%252Fsearch%252Fvais-building-blocks%252Fparsing_and_chunking_with_BYO.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN\" alt=\"Google Cloud Colab Enterprise logo\"><br> Open in Colab Enterprise\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://art-analytics.appspot.com/r.html?uaid=G-FHXEFWTT4E&utm_source=aRT-vais-building-blocks&utm_medium=aRT-clicks&utm_campaign=vais-building-blocks&destination=vais-building-blocks&url=https%3A%2F%2Fconsole.cloud.google.com%2Fvertex-ai%2Fworkbench%2Fdeploy-notebook%3Fdownload_url%3Dhttps%3A%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fapplied-ai-engineering-samples%2Fmain%2Fsearch%2Fvais-building-blocks%2Fparsing_and_chunking_with_BYO.ipynb\">\n",
        "      <img src=\"https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg\" alt=\"Vertex AI logo\"><br> Open in Vertex AI Workbench\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://github.com/GoogleCloudPlatform/applied-ai-engineering-samples/blob/main/search/vais-building-blocks/parsing_and_chunking_with_BYO.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg\" alt=\"GitHub logo\"><br> View on GitHub\n",
        "    </a>\n",
        "  </td>\n",
        "</table>\n",
        "\n",
        "<div style=\"clear: both;\"></div>\n",
        "\n",
        "<b>Share to:</b>\n",
        "\n",
        "<a href=\"https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vais-building-blocks/parsing_and_chunking_with_byo.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg\" alt=\"LinkedIn logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vais-building-blocks/parsing_and_chunking_with_byo.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg\" alt=\"Bluesky logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vais-building-blocks/parsing_and_chunking_with_byo.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg\" alt=\"X logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vais-building-blocks/parsing_and_chunking_with_byo.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png\" alt=\"Reddit logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vais-building-blocks/parsing_and_chunking_with_byo.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg\" alt=\"Facebook logo\">\n",
        "</a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YQpT9IS3K-Fn"
      },
      "source": [
        "| | |\n",
        "|----------|-------------|\n",
        "| Author(s)   | Jaival Desai, Hossein Mansour|\n",
        "| Reviewers(s) | Allie Chen, Rajesh Thallam|\n",
        "| Last updated | 2024-08-08: The first draft |"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zNu-9XmEDF52"
      },
      "source": [
        "# Overview\n",
        "\n",
        "In this notebook, we will demonstrate how to retrieve Parsed and Chunked documents from a [Vertex AI Search](https://cloud.google.com/generative-ai-app-builder/docs/introduction) (VAIS) datastore. Additionally, we will show how to Bring Your Own Chunks (BYOC) and ingest them into the datastore as needed. You can find more information [here](https://cloud.google.com/generative-ai-app-builder/docs/parse-chunk-documents#bring-parsed-document).\n",
        "\n",
        "We will perform the following steps:\n",
        "\n",
        "- [Prerequisite] Create a VAIS Datastore and import sample documents\n",
        "- Get [Processed Document](https://cloud.google.com/generative-ai-app-builder/docs/parse-chunk-documents#get-parsed-documents) from datastore\n",
        "- Get [Chunks](https://cloud.google.com/generative-ai-app-builder/docs/parse-chunk-documents#get-processed-chunks) from datastore\n",
        "- Reconstruct the document from Chunks for visual inspection\n",
        "- Store Chunks for offline review and/or edit\n",
        "-[Bring your Own Chunks](https://cloud.google.com/generative-ai-app-builder/docs/parse-chunk-documents#bring-chunks). At the time of publishing this notebook, the BYOC feature is available under private preview. To be allowlisted for this feature, please contact your Google account team.\n",
        "\n",
        "![byoc.png](https://storage.googleapis.com/github-repo/generative-ai/search/vais-building-blocks/parsing_and_chunking_with_BYO/byoc.png)\n",
        "\n",
        "REST API is used throughout this notebook. Please consult the [official documentation](https://cloud.google.com/generative-ai-app-builder/docs/apis) for alternative ways to achieve the same goal, namely Client libraries and RPC.\n",
        "\n",
        "\n",
        "# Vertex AI Search\n",
        "Vertex AI Search (VAIS) is a fully-managed platform, powered by large language models, that lets you build AI-enabled search and recommendation experiences for your public or private websites or mobile applications\n",
        "\n",
        "VAIS can handle a diverse set of data sources including structured, unstructured, and website data, as well as data from third-party applications such as Jira, Salesforce, and Confluence.\n",
        "\n",
        "VAIS also has built-in integration with LLMs which enables you to provide answers to complex questions, grounded in your data\n",
        "\n",
        "#Using this Notebook\n",
        "If you're running outside of Colab, depending on your environment you may need to install pip packages that are included in the Colab environment by default but are not part of the Python Standard Library. Outside of Colab you'll also notice comments in code cells that look like #@something, these trigger special Colab functionality but don't change the behavior of the notebook.\n",
        "\n",
        "This tutorial uses the following Google Cloud services and resources:\n",
        "\n",
        "- Service Usage API\n",
        "- Discovery Engine\n",
        "- Google Cloud Storage Client\n",
        "\n",
        "This notebook has been tested in the following environment:\n",
        "\n",
        "- Python version = 3.10.12\n",
        "- google.cloud.storage = 2.8.0\n",
        "- google.auth = 2.27.0\n",
        "\n",
        "# Getting Started\n",
        "\n",
        "The following steps are necessary to run this notebook, no matter what notebook environment you're using.\n",
        "\n",
        "If you're entirely new to Google Cloud, [get started here](https://cloud.google.com/docs/get-started)\n",
        "\n",
        "## Google Cloud Project Setup\n",
        "\n",
        "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs\n",
        "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project)\n",
        "3. [Enable the Service Usage API](https://console.cloud.google.com/apis/library/serviceusage.googleapis.com)\n",
        "4. [Enable the Cloud Storage API](https://console.cloud.google.com/flows/enableapi?apiid=storage.googleapis.com)\n",
        "5. [Enable the Discovery Engine API for your project](https://console.cloud.google.com/marketplace/product/google/discoveryengine.googleapis.com)\n",
        "\n",
        "## Google Cloud Permissions\n",
        "\n",
        "Ideally you should have [Owner role](https://cloud.google.com/iam/docs/understanding-roles) for your project to run this notebook. If that is not an option, you need at least the following [roles](https://cloud.google.com/iam/docs/granting-changing-revoking-access)\n",
        "- **`roles/serviceusage.serviceUsageAdmin`** to enable APIs\n",
        "- **`roles/iam.serviceAccountAdmin`** to modify service agent permissions\n",
        "- **`roles/discoveryengine.admin`** to modify discoveryengine assets\n",
        "- **`roles/storage.objectAdmin`** to modify and delete GCS buckets"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JwTJMRNlrOEf"
      },
      "source": [
        "#Setup Environment"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lJFp9LUmrSOf"
      },
      "source": [
        "## Authentication\n",
        "\n",
        " If you're using Colab, run the code in the next cell. Follow the pop-ups and authenticate with an account that has access to your Google Cloud [project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#identifying_projects).\n",
        "\n",
        "If you're running this notebook somewhere besides Colab, make sure your environment has the right Google Cloud access. If that's a new concept to you, consider looking into [Application Default Credentials for your local environment](https://cloud.google.com/docs/authentication/provide-credentials-adc#local-dev) and [initializing the Google Cloud CLI](https://cloud.google.com/docs/authentication/gcloud). In many cases, running `gcloud auth application-default login` in a shell on the machine running the notebook kernel is sufficient.\n",
        "\n",
        "More authentication options are discussed [here](https://cloud.google.com/docs/authentication)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-OKbRWQGrc-R"
      },
      "outputs": [],
      "source": [
        "# Colab authentication.\n",
        "import sys\n",
        "\n",
        "if \"google.colab\" in sys.modules:\n",
        "    from google.colab import auth\n",
        "\n",
        "    auth.authenticate_user()\n",
        "    print(\"Authenticated\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pqF-4eiwP_j8"
      },
      "outputs": [],
      "source": [
        "from google.auth import default\n",
        "from google.auth.transport.requests import AuthorizedSession\n",
        "\n",
        "creds, _ = default()\n",
        "authed_session = AuthorizedSession(creds)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "kdQPp72R11pd"
      },
      "source": [
        "## Import Libraries"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "JeAqU1WBrXDy"
      },
      "outputs": [],
      "source": [
        "import json\n",
        "import re\n",
        "import sys\n",
        "import textwrap\n",
        "import time\n",
        "from typing import Any\n",
        "from urllib.parse import urlparse\n",
        "\n",
        "from google.auth import default\n",
        "from google.auth.transport.requests import AuthorizedSession\n",
        "from google.cloud import storage\n",
        "import requests"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MGXEinm3q1ks"
      },
      "source": [
        "## Configure environment\n",
        "\n",
        "You can enter the ID for an existing Vertex AI Search App and Datastore to be used in this notebook.\n",
        "\n",
        "Alternatively, you can enter the desired IDs for non-existings App and Datastore and they will be created later in this notebook.\n",
        "\n",
        "Same applies to the Cloud Storage buckets to store Documents and Metadata. The Documents and Metadata can be in separate buckets, but it is advised to keep them (together with the JSONL created later in this notebook) in the same temporary bucket for the ease of cleanup.\n",
        "\n",
        "You can find more information regarding the `location` of datastores and associated limitations [here](https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store).\n",
        "\n",
        "The location of a Datastore is set at the time of creation and it should be called appropriately to query the Datastore.\n",
        "\n",
        "`FILE_NAME_VAIS_OUTPUT` is used to upload the Chunked Document to the bucket specified."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_wsHOE5dl_3i"
      },
      "outputs": [],
      "source": [
        "PROJECT_ID = \"\"  # @param {type:\"string\"}\n",
        "\n",
        "# Vertex AI Search Parameters\n",
        "DATASTORE_ID = \"goog_earnings_test\"  # @param {type:\"string\"}\n",
        "LOCATION = \"global\"  # @param [\"global\", \"us\", \"eu\"] Global is preferred\n",
        "GCS_BUCKET = \"sample_earnings\"  # @param {type:\"string\"}\n",
        "FILE_NAME_VAIS_OUTPUT = \"chunked_doc_from_VAIS.json\"  # @param {type:\"string\"}"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nwHJzjwbTlF_"
      },
      "source": [
        "# STEP 1. Create VAIS Datastore\n",
        "\n",
        "You can skip this section if you already have a datastore with your target unstructured documents ingested with [Chunk mode](https://cloud.google.com/generative-ai-app-builder/docs/parse-chunk-documents), which indexes your data as chunks to improve relevance and decrease computational load for LLMs."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "v-soe_CgVnFb"
      },
      "source": [
        "## Helper functions to create a Datastore\n",
        "\n",
        "The datastore is created with [Chunk Mode](https://cloud.google.com/generative-ai-app-builder/docs/parse-chunk-documents) and Chunk size of 500 tokens.\n",
        "\n",
        "The documents will be processed with Layout parser (higher quality for complex documents containing elements like tables and lists) and Ancestor information (i.e. headings) is included with each Chunk. Please see [official documentation](https://cloud.google.com/generative-ai-app-builder/docs/parse-chunk-documents) for more details."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "BG1sHc6hWH86"
      },
      "outputs": [],
      "source": [
        "def create_chunk_mode_datastore(\n",
        "    project_id: str, location: str, datastore_id: str\n",
        ") -> int:\n",
        "    \"\"\"Create a datastore with chunk mode and the more advanced layout parser\"\"\"\n",
        "    payload = {\n",
        "        \"displayName\": datastore_id,\n",
        "        \"industryVertical\": \"GENERIC\",\n",
        "        \"solutionTypes\": [\"SOLUTION_TYPE_SEARCH\"],\n",
        "        \"contentConfig\": \"CONTENT_REQUIRED\",\n",
        "        \"documentProcessingConfig\": {\n",
        "            \"chunkingConfig\": {\n",
        "                \"layoutBasedChunkingConfig\": {\n",
        "                    \"chunkSize\": 500,\n",
        "                    \"includeAncestorHeadings\": True,\n",
        "                }\n",
        "            },\n",
        "            \"defaultParsingConfig\": {\"layoutParsingConfig\": {}},\n",
        "        },\n",
        "    }\n",
        "    header = {\"X-Goog-User-Project\": project_id, \"Content-Type\": \"application/json\"}\n",
        "    es_endpoint = f\"https://discoveryengine.googleapis.com/v1/projects/{project_id}/locations/{location}/collections/default_collection/dataStores?dataStoreId={datastore_id}\"\n",
        "    response = authed_session.post(\n",
        "        es_endpoint, data=json.dumps(payload), headers=header\n",
        "    )\n",
        "    if response.status_code == 200:\n",
        "        print(f\"The creation of Datastore {datastore_id} is initiated.\")\n",
        "        print(\"It may take a few minutes for the Datastore to become available\")\n",
        "    else:\n",
        "        print(f\"Failed to create Datastore {datastore_id}\")\n",
        "        print(response.json())\n",
        "    return response.status_code"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "w2q8UZxwRd1m"
      },
      "source": [
        "## Helper functions to issue basic search on a Datastore"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "98bc0zgCWhqP"
      },
      "outputs": [],
      "source": [
        "def search_by_datastore(\n",
        "    project_id: str, location: str, datastore_id: str, query: str\n",
        ") -> requests.Response:\n",
        "    \"\"\"Searches a datastore using the provided query.\"\"\"\n",
        "    response = authed_session.post(\n",
        "        f\"https://discoveryengine.googleapis.com/v1/projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/servingConfigs/default_search:search\",\n",
        "        headers={\n",
        "            \"Content-Type\": \"application/json\",\n",
        "        },\n",
        "        json={\"query\": query, \"pageSize\": 1},\n",
        "    )\n",
        "    return response"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fJ3ULZOeR-E2"
      },
      "source": [
        "## Helper functions to check whether or not a Datastore already exists"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ShppuvcxWBut"
      },
      "outputs": [],
      "source": [
        "def datastore_exists(project_id: str, location: str, datastore_id: str) -> bool:\n",
        "    \"\"\"Check if a datastore exists.\"\"\"\n",
        "    response = search_by_datastore(project_id, location, datastore_id, \"test\")\n",
        "    status_code = response.status_code\n",
        "    if status_code == 200:\n",
        "        return True\n",
        "    if status_code == 404:\n",
        "        return False\n",
        "    raise Exception(f\"Error: {status_code}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NnCzxjV9SVO_"
      },
      "source": [
        "## Create a Datastore with the provided ID if it doesn't exist"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "nBnAc1NIV59z"
      },
      "outputs": [],
      "source": [
        "# Create Chunk mode Datastore if it doesn't exist\n",
        "if datastore_exists(PROJECT_ID, LOCATION, DATASTORE_ID):\n",
        "    print(f\"Datastore {DATASTORE_ID} already exists.\")\n",
        "else:\n",
        "    create_chunk_mode_datastore(PROJECT_ID, LOCATION, DATASTORE_ID)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MF07QdwxW_1n"
      },
      "source": [
        "## [Optional] Check if the Datastore is created successfully\n",
        "\n",
        "\n",
        "The Datastore is polled to track when it becomes available.\n",
        "\n",
        "This may take a few minutes after the datastore creation is initiated"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "S7xR3uX8XEnH"
      },
      "outputs": [],
      "source": [
        "while not datastore_exists(PROJECT_ID, LOCATION, DATASTORE_ID):\n",
        "    print(f\"Datastore {DATASTORE_ID} is still being created.\")\n",
        "    time.sleep(30)\n",
        "print(f\"Datastore {DATASTORE_ID} is created successfully.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Gxyl1Fs2XXBp"
      },
      "source": [
        "# STEP 2. Import sample document into VAIS Datastore"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IRaL_VoXTnIw"
      },
      "source": [
        "## Create a GCS bucket with sample document(s)\n",
        "\n",
        "This step is only needed for the purpose of this demo. For the real use case you will need to upload your actual documents to a GCS bucket\n",
        "\n",
        "Here, we download [Alphabet's 2024 Q2 Earnings Release](https://abc.xyz/assets/19/e4/3dc1d4d6439c81206370167db1bd/2024q2-alphabet-earnings-release.pdf) as a sample document."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "HHVR3KkzT2H4"
      },
      "outputs": [],
      "source": [
        "def create_gcs_bucket_and_download_files(\n",
        "    project_id: str, bucket_name: str, file_urls: list[str]\n",
        ") -> None:\n",
        "    \"\"\"\n",
        "    Creates a GCS bucket (if it doesn't exist) and downloads files from specified URLs.\n",
        "\n",
        "    Args:\n",
        "        project_id (str): Your Google Cloud Project ID.\n",
        "        bucket_name (str): The name of the GCS bucket (e.g., \"my-documents-bucket\").\n",
        "        file_urls (list): A list of URLs to files you want to download.\n",
        "    \"\"\"\n",
        "\n",
        "    storage_client = storage.Client(project=project_id)\n",
        "    bucket = storage_client.bucket(bucket_name)\n",
        "\n",
        "    if not bucket.exists():\n",
        "        bucket = storage_client.create_bucket(bucket_name)\n",
        "\n",
        "        print(f\"Bucket {bucket_name} created.\")\n",
        "\n",
        "    for url in file_urls:\n",
        "        file_name = url.split(\"/\")[-1]\n",
        "        print(f\"Downloading: {file_name}\")\n",
        "\n",
        "        try:\n",
        "            response = requests.get(url)\n",
        "            response.raise_for_status()  # Raise an exception for HTTP errors\n",
        "\n",
        "            blob = bucket.blob(file_name)\n",
        "            blob.upload_from_string(\n",
        "                response.content,\n",
        "                content_type=\"application/pdf\",  # Explicitly set the content type\n",
        "            )\n",
        "            print(f\"Uploaded: {file_name}\")\n",
        "        except requests.exceptions.RequestException as e:\n",
        "            print(f\"Error downloading {file_name}: {e}\")\n",
        "\n",
        "\n",
        "file_urls = [\n",
        "    \"https://abc.xyz/assets/19/e4/3dc1d4d6439c81206370167db1bd/2024q2-alphabet-earnings-release.pdf\"\n",
        "]\n",
        "\n",
        "create_gcs_bucket_and_download_files(PROJECT_ID, GCS_BUCKET, file_urls)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "js_EG6U2XaKL"
      },
      "source": [
        "## Helper function to import documents into a VAIS datastore\n",
        "\n",
        "This helper function is used to import the documents in a GCS folder into VAIS\n",
        "\n",
        "NOTE: The [\"dataSchema\"](https://cloud.google.com/generative-ai-app-builder/docs/reference/rest/v1/GcsSource?hl=en) should be specified as \"content\". This allows us to ingest PDF files directly. The default \"dataSchema\" is \"document\" which expects JSONL files(s) in `gcs_uri`. This option is most useful when we want to include Metadata. See [documentation](https://cloud.google.com/generative-ai-app-builder/docs/prepare-data?hl=en#storage-unstructured) for more details.\n",
        "\n",
        "The process is done asynchronously, and the request returns an instance of a \"Long running Operation\".\n",
        "\n",
        "For a small corpus like the one we are experimenting with in this notebook, the process takes in the order of xx minutes."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "a2EnwXpOXlj3"
      },
      "outputs": [],
      "source": [
        "def import_documents_from_gcs(\n",
        "    project_id: str, location: str, datastore_id: str, gcs_uri: str\n",
        ") -> str:\n",
        "    \"\"\"Imports unstructured documents from a GCS bucket.\"\"\"\n",
        "    payload = {\n",
        "        \"reconciliationMode\": \"INCREMENTAL\",\n",
        "        \"gcsSource\": {\"inputUris\": [gcs_uri], \"dataSchema\": \"content\"},\n",
        "    }\n",
        "    header = {\"Content-Type\": \"application/json\"}\n",
        "    es_endpoint = f\"https://discoveryengine.googleapis.com/v1/projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{datastore_id}/branches/default_branch/documents:import\"\n",
        "    response = authed_session.post(\n",
        "        es_endpoint, data=json.dumps(payload), headers=header\n",
        "    )\n",
        "    print(f\"--{response.json()}\")\n",
        "    return response.json()[\"name\"]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NrypDv7kXtLf"
      },
      "source": [
        "## Importing sample documents into the Chunk Mode Datastore"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qzKBkp9bXxZT"
      },
      "outputs": [],
      "source": [
        "chunk_mode_import_lro = import_documents_from_gcs(\n",
        "    project_id=PROJECT_ID,\n",
        "    location=LOCATION,\n",
        "    datastore_id=DATASTORE_ID,\n",
        "    gcs_uri=f\"gs://{GCS_BUCKET}/*\",\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0SmHf6ZfX3GI"
      },
      "source": [
        "## [Optional] Check the status of document import for the Chunk Mode Datastore\n",
        "Optionally check the status of the long running operation for the import job. You can check this in the UI as well by looking at the \"activity\" tab of the corresponding Datastore"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "L_ID_6ozX5-u"
      },
      "outputs": [],
      "source": [
        "while True:\n",
        "    response = authed_session.get(\n",
        "        f\"https://discoveryengine.googleapis.com/v1/{chunk_mode_import_lro}\",\n",
        "    )\n",
        "    try:\n",
        "        status = response.json()[\"done\"]\n",
        "        if status:\n",
        "            print(\"Import completed!\")\n",
        "            break\n",
        "    except KeyError:\n",
        "        print(\"Import in progress.\")\n",
        "        time.sleep(60)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ndoKhd9lguyc"
      },
      "source": [
        "#Helper Functions for formatting and ease of visual inspection\n",
        "The following helper functions are used to reconstruct a document from its chunks and to show them in a human-friendly manner.\n",
        "\n",
        "These functions are not particularly related to VAIS and you do not need to worry about their details to understand the flow of this notebook"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XLT8IvSMXG58"
      },
      "source": [
        "##Helper function to beautify JSON outputs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kkZiVM8nR53S"
      },
      "outputs": [],
      "source": [
        "def parse_and_print_json(data: dict[str, Any]) -> dict[str, Any] | None:\n",
        "    \"\"\"\n",
        "    Recursively parses and structures JSON data into a more readable dictionary format,\n",
        "    handling nested dictionaries.\n",
        "\n",
        "    Args:\n",
        "        data (dict): The dictionary potentially containing JSON strings at any level.\n",
        "\n",
        "    Returns:\n",
        "        dict or None: The original dictionary with JSON strings parsed into dictionaries,\n",
        "                or None if there's an error during JSON decoding.\n",
        "    \"\"\"\n",
        "\n",
        "    for key, value in data.items():\n",
        "        if isinstance(value, str) and value.startswith(\"{\"):  # Check for JSON string\n",
        "            try:\n",
        "                data[key] = json.loads(\n",
        "                    value\n",
        "                )  # Parse and replace with the parsed dictionary\n",
        "            except json.JSONDecodeError as e:\n",
        "                print(f\"Error decoding JSON in key '{key}': {e}\")\n",
        "                return None\n",
        "        elif isinstance(value, dict):  # Recurse into nested dictionaries\n",
        "            result = parse_and_print_json(value)\n",
        "            if result is None:  # If an error occurred during recursion, propagate it\n",
        "                return None\n",
        "\n",
        "    return data\n",
        "\n",
        "\n",
        "def print_json(data: dict[str, Any]) -> dict[str, Any]:\n",
        "    \"\"\"\n",
        "    Structures the JSON data into a more readable dictionary format\n",
        "\n",
        "    Args:\n",
        "        data (dict): The parsed JSON data as a dictionary.\n",
        "\n",
        "    Returns:\n",
        "        dict: The structured JSON data\n",
        "    \"\"\"\n",
        "    output = {}\n",
        "\n",
        "    for key, value in data.items():\n",
        "        if isinstance(value, dict):\n",
        "            output[key] = print_json(value)\n",
        "        elif isinstance(value, list):\n",
        "            output[key] = [\n",
        "                print_json(item) if isinstance(item, dict) else item for item in value\n",
        "            ]\n",
        "        else:\n",
        "            output[key] = value\n",
        "\n",
        "    return output"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZPZl6Qiag2ML"
      },
      "source": [
        "##Helper function to reconstruct a document from chunks\n",
        "\n",
        "Stitch chunks together to reconstruct the document while including pointers for chunk start and end."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yMa-PpoxhVEa"
      },
      "outputs": [],
      "source": [
        "def reconstruct_document(chunked_document: dict[str, Any]) -> str:\n",
        "    \"\"\"Reconstructs a document from its chunks.\"\"\"\n",
        "    reconstructed_document = \"\"\n",
        "    for chunk in chunked_document[\"jsonData\"][\"chunks\"]:\n",
        "        reconstructed_document += \"Start of chunk: \" + chunk[\"id\"] + \"\\n\\n\"\n",
        "        reconstructed_document += chunk[\"content\"]\n",
        "        reconstructed_document += \"\\n\\nEnd of chunk: \" + chunk[\"id\"] + \"\\n\\n\"\n",
        "\n",
        "    return reconstructed_document"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "d1WK0DUEjd39"
      },
      "source": [
        "## Helper function to beautify a Markdown Table\n",
        "\n",
        "Takes the markdown table from chunks and makes it more human readable using appropriate column widths using pipes and horizontal separators."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "J5VOpJyKhDdP"
      },
      "outputs": [],
      "source": [
        "def format_markdown_table(table: str, max_cell_width: int = 15) -> str:\n",
        "    \"\"\"Formats a poorly formatted Markdown table with aligned pipes,\n",
        "    horizontal separators, and cell value wrapping.\n",
        "\n",
        "    Args:\n",
        "        table: A string containing the poorly formatted Markdown table.\n",
        "        max_cell_width: The maximum allowed width for each cell.\n",
        "\n",
        "    Returns:\n",
        "        A string containing the nicely formatted Markdown table with\n",
        "        wrapped cell values.\n",
        "    \"\"\"\n",
        "\n",
        "    # Split the table into rows.\n",
        "    rows = table.strip().split(\"\\n\")\n",
        "\n",
        "    # Find the actual header row (skipping rows with only hyphens)\n",
        "    header_row_index = next(\n",
        "        (\n",
        "            i\n",
        "            for i, row in enumerate(rows)\n",
        "            if not all(cell.strip() == \"-\" for cell in row.strip().split(\"|\")[1:-1])\n",
        "        ),\n",
        "        0,  # Default to the first row if no suitable header is found\n",
        "    )\n",
        "\n",
        "    # Split each row into cells, ensuring empty cells are accounted for\n",
        "    cells_per_row = [\n",
        "        [\n",
        "            cell.strip() for cell in row.strip().split(\"|\")[1:-1]\n",
        "        ]  # Remove leading and trailing pipes before splitting\n",
        "        for row in rows\n",
        "    ]\n",
        "\n",
        "    # Determine the number of columns, considering both header and data rows\n",
        "    num_columns = max(len(row) for row in cells_per_row)\n",
        "\n",
        "    # Determine the maximum width of each column, considering the max_cell_width limit.\n",
        "    column_widths = [\n",
        "        min(\n",
        "            max(\n",
        "                len(cells_per_row[row_index][col_index])\n",
        "                for row_index in range(len(cells_per_row))\n",
        "                if col_index < len(cells_per_row[row_index])\n",
        "            ),  # Handle rows with fewer columns\n",
        "            max_cell_width,\n",
        "        )\n",
        "        for col_index in range(num_columns)\n",
        "    ]\n",
        "\n",
        "    # Set a minimum column width to prevent 0 width\n",
        "    min_column_width = 3  # Or any other reasonable minimum\n",
        "    column_widths = [max(width, min_column_width) for width in column_widths]\n",
        "\n",
        "    # Function to wrap cell values if they exceed the column width\n",
        "    def wrap_cell_value(cell_value, width):\n",
        "        wrapped_lines = textwrap.wrap(cell_value, width=width)\n",
        "        return wrapped_lines\n",
        "\n",
        "    # Format the header row, potentially adding empty cells if needed\n",
        "    formatted_header_cells = [\n",
        "        wrap_cell_value(cell, column_widths[i])\n",
        "        for i, cell in enumerate(cells_per_row[header_row_index])\n",
        "    ]\n",
        "    formatted_header_cells += [[\"\"]] * (\n",
        "        num_columns - len(formatted_header_cells)\n",
        "    )  # Add empty cells if needed\n",
        "    max_lines_in_header = max(len(lines) for lines in formatted_header_cells)\n",
        "    formatted_header_rows = []\n",
        "    for line_index in range(max_lines_in_header):\n",
        "        formatted_header_rows.append(\n",
        "            \"| \"\n",
        "            + \" | \".join(\n",
        "                (cell[line_index] if line_index < len(cell) else \"\")\n",
        "                + \" \"\n",
        "                * (\n",
        "                    column_widths[i]\n",
        "                    - len(cell[line_index] if line_index < len(cell) else \"\")\n",
        "                )\n",
        "                for i, cell in enumerate(formatted_header_cells)\n",
        "            )\n",
        "            + \" |\"\n",
        "        )\n",
        "\n",
        "    formatted_rows = formatted_header_rows\n",
        "\n",
        "    # Format the separator row beneath the header.\n",
        "    formatted_rows.append(\n",
        "        \"|\" + \"|\".join(\"-\" * (width + 2) for width in column_widths) + \"|\"\n",
        "    )\n",
        "\n",
        "    # Format the remaining rows (excluding the hyphen-only row if present), adding separators after each row\n",
        "    for row_index, row in enumerate(cells_per_row):\n",
        "        if row_index != header_row_index and not all(\n",
        "            cell.strip() == \"-\" for cell in row\n",
        "        ):  # Skip header and hyphen-only rows\n",
        "            # Pad row with empty cells if needed\n",
        "            padded_row = row + [\"\"] * (num_columns - len(row))\n",
        "            wrapped_cells = [\n",
        "                wrap_cell_value(cell, column_widths[i])\n",
        "                for i, cell in enumerate(padded_row)\n",
        "            ]\n",
        "            max_lines_in_row = max(len(lines) for lines in wrapped_cells)\n",
        "            for line_index in range(max_lines_in_row):\n",
        "                formatted_row = (\n",
        "                    \"| \"\n",
        "                    + \" | \".join(\n",
        "                        (cell[line_index] if line_index < len(cell) else \"\")\n",
        "                        + \" \"\n",
        "                        * (\n",
        "                            column_widths[i]\n",
        "                            - len(cell[line_index] if line_index < len(cell) else \"\")\n",
        "                        )\n",
        "                        for i, cell in enumerate(wrapped_cells)\n",
        "                    )\n",
        "                    + \" |\"\n",
        "                )\n",
        "                formatted_rows.append(formatted_row)\n",
        "\n",
        "            # Add separator row after each data row (except the last one)\n",
        "            if row_index < len(cells_per_row) - 1:\n",
        "                formatted_rows.append(\n",
        "                    \"|\" + \"|\".join(\"-\" * (width + 2) for width in column_widths) + \"|\"\n",
        "                )\n",
        "\n",
        "    # Join the formatted rows into a single string.\n",
        "    return \"\\n\".join(formatted_rows)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "I-50mUGuiC_2"
      },
      "source": [
        "## Helper function to beautify all Markdown Tables\n",
        "\n",
        "This function goes over the whole reconstructed document and replaces all markdown tables with their beautified versions"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GKxnoDn9hHA1"
      },
      "outputs": [],
      "source": [
        "def format_chunked_document(text: str) -> str:\n",
        "    \"\"\"Identifies markdown tables within a string, formats them, and replaces the original instances.\n",
        "\n",
        "    Args:\n",
        "        text: The input string potentially containing multiple markdown tables.\n",
        "\n",
        "    Returns:\n",
        "        The modified string with formatted markdown tables replacing the original ones.\n",
        "    \"\"\"\n",
        "\n",
        "    # Define the pattern to match markdown table instances\n",
        "    table_pattern = r\"_START_OF_TABLE_\\nTABLE_IN_MARKDOWN:\\n(.*?)\\n_END_OF_TABLE_\"\n",
        "\n",
        "    # Find all matches of the pattern within the text\n",
        "    matches = re.findall(\n",
        "        table_pattern, text, re.DOTALL\n",
        "    )  # re.DOTALL allows '.' to match newlines\n",
        "\n",
        "    # Process each matched table and replace it in the original text\n",
        "    for table_content in matches:\n",
        "        formatted_table = format_markdown_table(table_content)\n",
        "        # Remove the extra newline before inserting the formatted table\n",
        "        text = text.replace(\n",
        "            f\"_START_OF_TABLE_\\nTABLE_IN_MARKDOWN:\\n{table_content}\\n_END_OF_TABLE_\",\n",
        "            \"\\n\" + formatted_table + \"\\n\\n\",\n",
        "            1,\n",
        "        )\n",
        "\n",
        "    return text"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nkWefjvj-NGn"
      },
      "source": [
        "# STEP 3. Get Parsed and Chunked Document\n",
        "\n",
        "In this section we visually review Parsed and Chunked versions of a document"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jspb2xq2ab_F"
      },
      "source": [
        "##List all documents in a Datastore\n",
        "Get a list of all documents in the datastore. You can then select the ID for the document of interest."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "281LTsCrabCX"
      },
      "outputs": [],
      "source": [
        "def list_documents_datastore(\n",
        "    project_id: str, location: str, data_store_id: str\n",
        ") -> list[dict[str, str]] | None:\n",
        "    \"\"\"Lists documents in a specified data store using the REST API.\n",
        "\n",
        "    Args:\n",
        "        project_id: The ID of your Google Cloud project.\n",
        "        location: The location of your data store.\n",
        "              Values: \"global\", \"us\", \"eu\"\n",
        "        data_store_id: The ID of the datastore.\n",
        "\n",
        "    Returns:\n",
        "        The JSON response containing the list of documents, or None if an error occurs.\n",
        "    \"\"\"\n",
        "\n",
        "    base_url = (\n",
        "        f\"{location}-discoveryengine.googleapis.com\"\n",
        "        if location != \"global\"\n",
        "        else \"discoveryengine.googleapis.com\"\n",
        "    )\n",
        "    url = f\"https://{base_url}/v1alpha/projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{data_store_id}/branches/default_branch/documents\"\n",
        "\n",
        "    try:\n",
        "        # Assuming 'authed_session' is available and properly configured for authentication\n",
        "        response = authed_session.get(url)\n",
        "        response.raise_for_status()  # Raise an exception for bad status codes\n",
        "        documents = response.json()\n",
        "        print(\n",
        "            f\"Successfully retrieved {len(documents.get('documents', []))} document(s).\\n\"\n",
        "        )\n",
        "        return [\n",
        "            {\"id\": document[\"id\"], \"uri\": document[\"content\"][\"uri\"]}\n",
        "            for document in documents.get(\"documents\", [])\n",
        "        ]\n",
        "\n",
        "    except requests.exceptions.RequestException as e:\n",
        "        print(f\"Error listing documents: {e}\")\n",
        "        return None\n",
        "\n",
        "\n",
        "list_documents_datastore(PROJECT_ID, LOCATION, DATASTORE_ID)\n",
        "DOCUMENT_ID = list_documents_datastore(PROJECT_ID, LOCATION, DATASTORE_ID)[0][\n",
        "    \"id\"\n",
        "]  # provisionally take the first document in the datastore as the document we want to analyze"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OMFcf4zPiZ6N"
      },
      "source": [
        "##Select the Document of interest\n",
        "By running the previous block, the Document ID of interest will be pre-set to the first document in the Datastore.\n",
        "\n",
        "You can update as needed.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "h6xUzPqQcR3o"
      },
      "outputs": [],
      "source": [
        "DOCUMENT_ID = \"\"  # @param {type:\"string\"}"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "c7h6PhYr-R-F"
      },
      "source": [
        "## Get the Parsed Document\n",
        "\n",
        "Get the parsed version of the document of interest.\n",
        "\n",
        "The parsed document is not really human readable. However it might be useful to troubleshoot downstream issues such as text element identification or cell block detection.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "HJ_mtgCxmNv7"
      },
      "outputs": [],
      "source": [
        "def get_parsed_document(\n",
        "    project_id: str, data_store_id: str, document_id: str\n",
        ") -> dict[str, Any] | None:\n",
        "    \"\"\"Retrieves a parsed document in JSON from Vertex AI Agent Builder.\"\"\"\n",
        "    \"\"\"Only applicable for data stores with Chunking config set.\"\"\"\n",
        "\n",
        "    # Get authentication token (replace with your method)\n",
        "\n",
        "    base_url = \"https://discoveryengine.googleapis.com/v1alpha\"\n",
        "    url = f\"{base_url}/projects/{project_id}/locations/global/collections/default_collection/dataStores/{data_store_id}/branches/0/documents/{document_id}:getProcessedDocument?processed_document_type=PARSED_DOCUMENT\"\n",
        "    response = authed_session.get(url)\n",
        "\n",
        "    if response.status_code == 200:\n",
        "        parsed_document = parse_and_print_json(response.json())\n",
        "        return parsed_document\n",
        "    else:\n",
        "        print(f\"Error: {response.status_code}, {response.text}\")\n",
        "        return None\n",
        "\n",
        "\n",
        "parsed_document = get_parsed_document(PROJECT_ID, DATASTORE_ID, DOCUMENT_ID)\n",
        "parsed_document"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "heR9Ebfix3hD"
      },
      "source": [
        "##Get the Chunked Document\n",
        "\n",
        "Get Chunks from the document in JSON format"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "aHGruIcax7ij"
      },
      "outputs": [],
      "source": [
        "def get_chunked_document(\n",
        "    project_id: str, data_store_id: str, document_id: str\n",
        ") -> dict[str, Any] | None:\n",
        "    \"\"\"Retrieves a chunked document in JSON from Vertex AI Agent Builder.\"\"\"\n",
        "    \"\"\"Only applicable for data stores with Chunking config set.\"\"\"\n",
        "\n",
        "    # Get authentication token (replace with your method)\n",
        "\n",
        "    base_url = \"https://discoveryengine.googleapis.com/v1alpha\"\n",
        "    url = f\"{base_url}/projects/{project_id}/locations/global/collections/default_collection/dataStores/{data_store_id}/branches/0/documents/{document_id}:getProcessedDocument?processed_document_type=CHUNKED_DOCUMENT\"\n",
        "    response = authed_session.get(url)\n",
        "\n",
        "    if response.status_code == 200:\n",
        "        chunked_document = parse_and_print_json(response.json())\n",
        "        return chunked_document\n",
        "    else:\n",
        "        print(f\"Error: {response.status_code}, {response.text}\")\n",
        "        return None\n",
        "\n",
        "\n",
        "chunked_document = get_chunked_document(PROJECT_ID, DATASTORE_ID, DOCUMENT_ID)\n",
        "chunked_document"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "t1gLR_5cN-3-"
      },
      "source": [
        "## Visually review the Chunked document\n",
        "\n",
        "Visually review to spot issues with the chunked document.\n",
        "\n",
        "The chunks from JSON object are stacked together first, and beautified later for ease of human reviewing.\n",
        "\n",
        "Helper functions defined earlier in this notebook are used here.\n",
        "\n",
        "For offline reviewing, you can export the string `chunked_document` to your desired format (e.g. PDF)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "i6Oy79QnFqHm"
      },
      "outputs": [],
      "source": [
        "reconstructed_document = reconstruct_document(chunked_document)\n",
        "processed_string = format_chunked_document(reconstructed_document)\n",
        "print(processed_string)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "68QbshVtqTXj"
      },
      "source": [
        "The beautified chunked version of the sample document use in this notebook will begin like the screenshot below:\n",
        "\n",
        "![chunked_document.png](https://storage.googleapis.com/github-repo/generative-ai/search/vais-building-blocks/parsing_and_chunking_with_BYO/chunked_document.png)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "65AKofzIuU2t"
      },
      "source": [
        "## [Optional] Upload Chunks to GCS Bucket\n",
        "\n",
        "Upload chunked document for offline review and edit.\n",
        "\n",
        "You can always transform JSON to your preferred formats (e.g. CSV, XLSX) before exporting."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "e95_7EcOs1Vt"
      },
      "outputs": [],
      "source": [
        "def upload_json_to_gcs(\n",
        "    bucket_name: str, file_name: str, json_data: dict[str, Any] | list[Any]\n",
        ") -> None:\n",
        "    \"\"\"Uploads a JSON variable to a GCS bucket as a file.\n",
        "\n",
        "    Args:\n",
        "        bucket_name: The name of the GCS bucket (must start with 'gs://' and end with '/').\n",
        "        file_name: The desired name of the JSON file within the bucket.\n",
        "        json_data: The JSON data to be uploaded (Python dictionary or list).\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If the bucket_name format is invalid.\n",
        "    \"\"\"\n",
        "\n",
        "    if not bucket_name.startswith(\"gs://\") or not bucket_name.endswith(\"/\"):\n",
        "        raise ValueError(\n",
        "            \"Invalid GCS path format. Must start with 'gs://' and end with '/'. \"\n",
        "            f\"Received: '{bucket_name}'\"\n",
        "        )\n",
        "\n",
        "    storage_client = storage.Client(\n",
        "        project=PROJECT_ID\n",
        "    )  # Assuming PROJECT_ID is defined\n",
        "\n",
        "    parsed_path = urlparse(bucket_name)\n",
        "    bucket_name = parsed_path.netloc\n",
        "\n",
        "    bucket = storage_client.bucket(bucket_name)\n",
        "    blob = bucket.blob(file_name)\n",
        "\n",
        "    # Convert the JSON data to a string\n",
        "    json_string = json.dumps(json_data, indent=2)\n",
        "\n",
        "    # Upload the JSON string as the file contents\n",
        "    blob.upload_from_string(json_string, content_type=\"application/json\")\n",
        "\n",
        "    print(\n",
        "        f\"JSON data uploaded to https://storage.mtls.cloud.google.com/{bucket_name}/{file_name}\\n\"\n",
        "    )\n",
        "\n",
        "\n",
        "upload_json_to_gcs(\n",
        "    \"gs://\" + GCS_BUCKET + \"/\", FILE_NAME_VAIS_OUTPUT, chunked_document[\"jsonData\"]\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9yM5bnVvvOxi"
      },
      "source": [
        "# STEP 4. [Needs Allowlisting] Bring Your Own Chunks (BYOC)\n",
        "\n",
        "This section describes how to [bring your own chunks](https://cloud.google.com/generative-ai-app-builder/docs/parse-chunk-documents#bring-chunks) into VAIS.\n",
        "\n",
        "The chunks can be completely generated by you, or you can take chunks generated by VAIS and modify them. Examples of the latter could be augmenting with additional context, or even batch processing to fix systematic issues with a certain document template like heading detection.\n",
        "\n",
        "Note that chunks should comply with the token limit specified at the time of creating the Datastore.\n",
        "\n",
        "At the time of publishing this notebook, the BYOC feature is available under private preview. To be allowlisted for this feature, please contact your Google account team.\n",
        "\n",
        "Additional Notes:\n",
        "\n",
        "1. This notebook showcases a particular use case of BYOC where VAIS is used for the initial parsing and chunking as well. In most cases for BYOC parsing and chunking is done outside VAIS and the chunks are brought into VAIS using BYOC.\n",
        "\n",
        "2. A document ingested using this feature is of type JSON and is treated separately from the original document used to generate the chunks (assuming that part is done in VAIS as well). To avoid duplicates, the original file needs to be removed after the BYOC document is ingested. You can use [this](https://github.com/GoogleCloudPlatform/applied-ai-engineering-samples/blob/main/genai-on-vertex-ai/vertex_ai_search/inline_ingestion_of_documents.ipynb) notebook to see how to delete a specific document via API.\n",
        "\n",
        "3. If you use VAIS to do the initial chunking, the `document metadata` will reference the original source document and its title. `document metadata` field in the chunked document is **only** used for retrieval purposes. You can modify that field as desired if you want to leverage it for other purposes.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fb8JrGgnxP73"
      },
      "source": [
        "## Function to import Chunks"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "A39akvzx89cA"
      },
      "outputs": [],
      "source": [
        "def upload_document_chunks(project_id: str, data_store_id: str, uri_path: str) -> None:\n",
        "    \"\"\"Uploads chunks of a document to a Vertex AI data store.\"\"\"\n",
        "\n",
        "    # Get authentication token using gcloud\n",
        "\n",
        "    base_url = \"https://discoveryengine.googleapis.com/v1alpha\"\n",
        "    url = f\"{base_url}/projects/{project_id}/locations/global/collections/default_collection/dataStores/{data_store_id}/branches/default_branch/documents:import\"\n",
        "\n",
        "    # Prepare the request payload\n",
        "    # header = {\"Content-Type\": \"application/json\"}\n",
        "    payload = {\n",
        "        \"reconciliationMode\": \"INCREMENTAL\",\n",
        "        \"gcsSource\": {\"inputUris\": uri_path, \"dataSchema\": \"content\"},\n",
        "    }\n",
        "\n",
        "    response = authed_session.post(url=url, json=payload)\n",
        "\n",
        "    if response.status_code == 200:\n",
        "        print(\"Chunked file uploaded successfully!\")\n",
        "    else:\n",
        "        print(f\"Error uploading chunked file: {response.status_code}, {response.text}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EjMOrPkty7PR"
      },
      "source": [
        "## Import Chunks\n",
        "\n",
        "Define the file name with chunks to be imported and run the function to actually import it.\n",
        "\n",
        "The formatting of the file should be same as `jsonData` field in the Chunked document.\n",
        "\n",
        "For the sake of quick testing you use the exported chunked document here and reimport it into VAIS."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Klnz7UCsyWzF"
      },
      "outputs": [],
      "source": [
        "FILE_NAME_TO_IMPORT = \"chunked_doc_to_import.json\"  # @param {type:\"string\"}\n",
        "upload_document_chunks(\n",
        "    PROJECT_ID, DATASTORE_ID, \"gs://\" + GCS_BUCKET + \"/\" + FILE_NAME_TO_IMPORT\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QVJ5iIf4cuOC"
      },
      "source": [
        "## Visually review BYO Chunked document\n",
        "\n",
        "Follow the instructions under step 3 to\n",
        "\n",
        "- List documents in the datastore\n",
        "- Identify the BYO chunked document\n",
        "- Get the chunked document, and use helper functions to stack the chunks together and visually review it.\n",
        "\n",
        "The screenshot below shows what you can get by slightly modifying the chunked document by VAIS and ingesting it back into VAIS (Note that the first line is manually added to the first chunk).\n",
        "\n",
        "![byoc.png](https://storage.googleapis.com/github-repo/generative-ai/search/vais-building-blocks/parsing_and_chunking_with_BYO/byoc.png)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "b_9s1JT6AS7o"
      },
      "source": [
        "#Cleanup\n",
        "Clean up resources created in this notebook.\n",
        "\n",
        "Set `DELETE_RESOURCES` flag to `True` to delete resources."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9yeinaBzeok9"
      },
      "outputs": [],
      "source": [
        "DELETE_RESOURCES = False"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sS7tVxEgAXGA"
      },
      "source": [
        "## Clean up GCS bucket\n",
        "\n",
        "❗❗❗ Only run the below cells if you created a new bucket just for this notebook ❗❗❗\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dAOJ46asAuoa"
      },
      "outputs": [],
      "source": [
        "def empty_bucket(bucket_name: str) -> None:\n",
        "    \"\"\"Deletes all objects in the specified GCS bucket.\"\"\"\n",
        "    client = storage.Client()\n",
        "    bucket = client.get_bucket(bucket_name)\n",
        "\n",
        "    blobs = bucket.list_blobs()  # List all blobs (objects)\n",
        "    for blob in blobs:\n",
        "        blob.delete()  # Delete each blob\n",
        "\n",
        "    print(f\"Bucket {bucket_name} emptied.\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DoIjtak6Ab7O"
      },
      "outputs": [],
      "source": [
        "if DELETE_RESOURCES:\n",
        "    # Empty the bucket by deleting all files in it\n",
        "    empty_bucket(GCS_BUCKET)\n",
        "\n",
        "    # Create a client object\n",
        "    client = storage.Client(project=PROJECT_ID)\n",
        "\n",
        "    # Get the bucket object\n",
        "    bucket = client.get_bucket(GCS_BUCKET)\n",
        "\n",
        "    # Delete the bucket\n",
        "    bucket.delete()\n",
        "\n",
        "    print(f\"Bucket {GCS_BUCKET} deleted successfully.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "L0aU2DdTckUo"
      },
      "source": [
        "## Delete the Datastore\n",
        "Delete the Datastore if you no longer need it\n",
        "\n",
        "Alternatively you can follow [these instructions](https://console.cloud.google.com/gen-app-builder/data-stores) to delete a Datastore from the UI"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pBKcL_oicjxL"
      },
      "outputs": [],
      "source": [
        "if DELETE_RESOURCES:\n",
        "    response = authed_session.delete(\n",
        "        f\"https://discoveryengine.googleapis.com/v1alpha/projects/{PROJECT_ID}/locations/{LOCATION}/collections/default_collection/dataStores/{DATASTORE_ID}\",\n",
        "        headers={\"X-Goog-User-Project\": PROJECT_ID},\n",
        "    )\n",
        "\n",
        "    print(response.json())"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "parsing_and_chunking_with_BYO.ipynb",
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
